###############################################
### Project: Communicating in an eventful   ###
###          campaign                       ###
### Task:    Preparation - GLES data        ###
### Title:   04_Preparation_GLES.R          ###
###############################################

#---------------------------------------------------------------------------------
# Description:
#
# This script prepares the responses to the open-ended questions from GLES survey
# data (GLES Panel, Wave 16-19) for further use during the analysis.
#
# Attention: To run the script, users need to provide the raw GLES data, which
# are available via GESIS. The links are provided below.
#---------------------------------------------------------------------------------

#---------------------------------------------------------------------------------------------------------------------

# Preparation

## load packages
library(quanteda)
library(tidyverse)
library(newsmap)
library(haven)
library(xlsx)
library(irr)

#---------------------------------------------------------------------------------------------------------------------

# Load GLES survey data

# The following chunk of code loads the GLES panel survey, waves 16-19. Please download the files 
# containing the responses to the open-ended survey questions. The data sets are available via GESIS:
# 
# - Wave 16: https://search.gesis.org/research_data/ZA7722
# - Wave 17: https://search.gesis.org/research_data/ZA7723
# - Wave 18: https://search.gesis.org/research_data/ZA7724
# - Wave 19: https://search.gesis.org/research_data/ZA7725

## helper: English month name of a date
##
## months() returns names in the language of the session locale (e.g. "Mai"
## or "Juli" in a German locale). Indexing month.name keeps the output in
## English on every machine, so no per-wave gsub() patching is needed.
english_month <- function(x) {
  month.name[as.integer(format(as.Date(x), "%m"))]
}

## helper: combine the open-ended answers of one wave with its field period
##
## open_ended: data frame read from the wave's open-ended CSV file
## meta:       data frame read from the wave's .dta file; has to contain the
##             columns field_start and field_end
## survey:     label written to the survey column
## text_col:   name of the column in open_ended that holds the answers to
##             the "most important problem" question
##
## Returns a data frame with the columns survey, start_date, end_date,
## start_month, end_month and most_important_problem.
##
## NOTE(review): both inputs are combined by row position, i.e. this assumes
## that the CSV and the .dta file list the respondents in the same order.
## Joining on a respondent id would be safer - TODO confirm.
prepare_wave <- function(open_ended, meta, survey, text_col) {
  stopifnot(
    text_col %in% names(open_ended),
    all(c("field_start", "field_end") %in% names(meta)),
    nrow(open_ended) == nrow(meta)
  )

  data.frame(survey = survey,
             start_date = meta$field_start,
             end_date = meta$field_end,
             start_month = english_month(meta$field_start),
             end_month = english_month(meta$field_end),
             most_important_problem = open_ended[[text_col]])
}

## load panel wave 16
ZA7722_v2_0_0 <- read_dta("ZA7722_v2-0-0.dta")
data_wave16 <- prepare_wave(read.csv2("ZA7722_v2-0-0_open-ended.csv"),
                            ZA7722_v2_0_0,
                            survey = "Panel 2021 - Wave 16",
                            text_col = "kp16_840s")

## load panel wave 17
ZA7723_v1_0_0 <- read_dta("ZA7723_v1-0-0.dta")
data_wave17 <- prepare_wave(read.csv2("ZA7723_v1-0-0_open-ended.csv"),
                            ZA7723_v1_0_0,
                            survey = "Panel 2021 - Wave 17",
                            text_col = "kp17_840s")

## load panel wave 18
ZA7724_v1_0_0 <- read_dta("ZA7724_v1-0-0.dta")
data_wave18 <- prepare_wave(read.csv2("ZA7724_v1-0-0_open-ended.csv"),
                            ZA7724_v1_0_0,
                            survey = "Panel 2021 - Wave 18",
                            text_col = "kp18_840s")

## load panel wave 19
ZA7725_v1_0_0 <- read_dta("ZA7725_v1-0-0.dta")
data_wave19 <- prepare_wave(read.csv2("ZA7725_v1-0-0_open-ended.csv"),
                            ZA7725_v1_0_0,
                            survey = "Panel 2021 - Wave 19",
                            text_col = "kp19_840s")

## stack all waves into one data set
data <- rbind(data_wave16, data_wave17, data_wave18, data_wave19)

## clean text column
##
## Replace textual Latin-1 byte escapes by the German characters they stand
## for. The code for "ö" is <f6> (<e6> would be "æ"); the upper-case umlauts
## are covered as well. fixed = TRUE matches the escapes literally.
##
## NOTE(review): this only has an effect if the escapes are literally part of
## the text. If they are merely how R prints wrongly declared bytes, reading
## the CSV files with the correct fileEncoding is the proper fix - TODO
## confirm against the raw files.
latin1_escapes <- c("<df>" = "ß",
                    "<e4>" = "ä", "<f6>" = "ö", "<fc>" = "ü",
                    "<c4>" = "Ä", "<d6>" = "Ö", "<dc>" = "Ü")

for (escape in names(latin1_escapes)) {
  data$most_important_problem <- gsub(escape, latin1_escapes[[escape]],
                                      data$most_important_problem,
                                      fixed = TRUE)
}

#---------------------------------------------------------------------------------------------------------------------

# Classify open-ended questions (most important problem) into issue categories

set.seed(100)

## seed-word dictionary
##
## One entry per issue category; the values are glob patterns ("*" matches any
## sequence of characters). Multi-word patterns are separated by exactly one
## space. Every pattern is written on a single line: a string literal that is
## wrapped across source lines contains the line break and the indentation,
## and such a pattern cannot match the intended phrase.

## Note: Covid added to issue "Welfare_State"
dict <- dictionary(list(
  Agriculture = c("Landwirt*", "Agrar*", "Bauer*", "Bäuer*", "Lebensmittel*",
                  "Ernähr*"),
  Foreign_Affairs = c("Außenpol*", "außenpol*", "Vereinte* Nation*", "UN",
                      "Weltbank", "IMF", "Diplomat*", "Friede*",
                      "Entwicklungs*"),
  European_Union = c("Europäische* Union*", "Europäische* Gemeinschaft*",
                     "EU", "Europapol*", "Mitgliedsstaat*", "EZB", "EUGH",
                     "Brüssel", "Europäische* Kommission*",
                     "Europäische* Parlament*"),
  Defense = c("Verteidigungspol*", "Militär*", "militär*", "Bundeswehr*",
              "Wehr*", "Grundwehr*", "Bundesheer*", "Armee*", "Truppe",
              "Soldat*"),
  Freedom = c("Freiheit*", "Individualismus*", "Menschenrecht*",
              "Grundrecht*", "Bürgerrecht*"),
  Democracy = c("Demokrat*", "demokrat*", "direkt* Demokrat*", "Verfassung*",
                "verfassung*", "Parlament*", "parlament*"),
  Political_System = c("Föderal*", "föderal*", "subnation*", "Zentralis*",
                       "Bürokratie*", "Beamt*", "Korrupt*", "korrupt*",
                       "Parlament*", "parlament*"),
  Political_Authority = c("politisch* Führung*", "Kompetenz", "Kandidat*"),
  Economy = c("Wirtschaft*", "wirtschaft*", "Unternehm*", "Industr*",
              "industr*", "Steuer*", "Budget*", "Finanz*", "Markt*",
              "Verbraucher*", "Konsument*", "BIP", "Bank*", "Schulden*"),
  Technology_and_Infrastructure = c("Technologie*", "Infrastruktur*",
                                    "infrastruktur*", "Verkehr*", "Straße*",
                                    "Autobahn*", "Bahn*",
                                    "Digitalisierung*"),
  Environment = c("Umwelt*", "umwelt*", "Klima*", "klima*", "Natur*",
                  "Nachhaltig*", "nachhaltig*", "Emission*", "ökolog*",
                  "Treibhaus*", "Feinstaub*"),
  Culture = c("Kultur*", "Freizeit*", "Kunst*", "Künst*", "Muse*",
              "Bibliothek*", "Fernseh*", "Film*", "Theater*", "Musik*",
              "Sport*", "Bewegung*"),
  Equality = c("Gleich*", "gleich*", "Gerechtigkeit*", "gerecht*",
               "Diskriminierung*", "Umverteilung*", "Armut*", "Sozial*",
               "sozial*"),
  Welfare_State = c("Wohlfahrtsstaat*", "Gesundheit*", "Pfleg*", "Pension*",
                    "Rent*", "Armut*", "Frauen*", "Kind*", "Coron*",
                    "Covid*"),
  Education = c("Bildung*", "Schule*", "Student*", "Studium*", "Universit*"),
  Society_and_Values = c("Nationalis*", "nationalis*", "Rechts*", "Patriot*",
                         "patriot*", "Geschichte*", "Kultur*", "kultur*",
                         "Tradition*", "tradition*", "Kultur*", "Religion*",
                         "Kirche*", "Scheidung*", "Abtreibung*", "Familie*"),
  Immigration = c("Immigration*", "immigr*", "Migra*", "Asyl*", "Flücht*",
                  "Integration*", "integr*", "Multikult*", "multikult*",
                  "Ausländ*", "Einwander*", "Einheim*"),
  Law_and_Order = c("Justiz*", "Krimin*", "krimin*", "entkrimin*", "Poliz*",
                    "Exekutiv*", "Gewalt*", "Täter*", "Gericht*",
                    "Richter*"),
  Labour = c("Arbeit*", "Gehalt*", "Lohn*", "Beschäftig*", "Angestell*",
             "Gewerkschaft*"),
  ## survey codes for missing answers, collected in a category of their own
  Error_Codes = c("-93 Interview abgebrochen", "-99 keine Angabe",
                  "-97 trifft nicht zu", "-98 weiss nicht")
), tolower = TRUE)

## preprocessing

### create and clean corpus
corp <- corpus(data$most_important_problem)

### transliterate ASCII digraphs to umlauts
###
### "ue" is only replaced when it does not follow a/e/q. A blanket
### replacement also hits the diphthongs "au" + "e" and "eu" + "e" as well as
### "qu" + "e" and turns e.g. "Steuer" into "Steür", "Bauer" into "Baür" and
### "Frauen" into "Fraün", so that the seed words "Steuer*", "Bauer*" and
### "Frauen*" can no longer match.
###
### NOTE(review): this remains a heuristic. Words such as "aktuell" (ue) or
### "Israel" (ae) are still altered; a word list would be needed to avoid
### that.
corp <- gsub("ae", "ä", corp)
corp <- gsub("oe", "ö", corp)
corp <- gsub("(?<![AaEeQq])ue", "ü", corp, perl = TRUE)

### tokenization
### Multi-word expressions that should be treated as a single token; they
### mirror the multi-word seed words of the dictionary.
compound_phrases <- c("vereinte* nation*",
                      "europäische* union*",
                      "europäische* gemeinschaft*",
                      "europäische* kommission*",
                      "europäische* parlament*",
                      "direkt* demokrat*",
                      "politisch* führung*")

### tokenize the cleaned texts and join the phrases above into compounds
toks <- tokens_compound(tokens(corp), pattern = phrase(compound_phrases))

### create dfmat_label and dfmat_feat_select

### label matrix: dictionary look-up on the tokens, then counted per document
toks_label <- toks %>% tokens_lookup(dictionary = dict)
dfmat_label <- toks_label %>% dfm(tolower = FALSE)

### feature matrix: all tokens with their original case
dfmat_feat <- toks %>% dfm(tolower = FALSE)

### NOTE(review): no pattern is passed to dfm_select(), so presumably no
### feature is dropped here - confirm whether a selection was intended.
dfmat_feat_select <- dfmat_feat %>% dfm_select(case_insensitive = TRUE)

## get textmodel
## The dictionary matches (dfmat_label) serve as labels for the feature
## matrix.
tmod_nm <- textmodel_newsmap(dfmat_feat_select, y = dfmat_label)

## predict
## nm_prop: output of predict(type = "all"), converted to a data frame
nm_prop <- predict(tmod_nm, type = "all") %>%
  as.matrix() %>%
  as.data.frame()

## nm_label: default prediction, used as the label of each response
nm_label <- predict(tmod_nm)

## assemble the final data set: running id, survey data, predicted label
##
## The columns are put together by name. Picking them by position (id as
## column 28) only works as long as the prediction matrix has exactly 20
## columns, i.e. it breaks silently whenever a dictionary category is added
## or removed.
stopifnot(length(nm_label) == nrow(data))

df <- cbind(id = seq_len(nrow(data)),
            data,
            label = as.character(nm_label))
rownames(df) <- NULL

# tabulate label frequencies
table(df$label)

#---------------------------------------------------------------------------------------------------------------------

# Save final data set

## write the labelled survey responses to disk for the analysis scripts
saveRDS(object = df, file = "GLES_Data.RDS")
